##############################################################################
#DATA MANIPULATION AND MATCHING

library(tidyverse)
library(psych)
library(StatMatch)
library(dplyr)

ms<-read.csv("MScomma.csv") # see README.txt file
bf<-read.csv("BFcomma.csv") # see README.txt file

# HARMONIZATION PHASE
bf <- bf %>% rename(ncomp = c_Ncmp_fatto)

ms$rip3 <- case_when(
  ms$ripmf %in% c(1, 2) ~ 1,
  ms$ripmf == 3         ~ 2,
  ms$ripmf %in% c(4, 5) ~ 3
)
bf$rip3 <- case_when(
  bf$rip %in% c(1, 2) ~ 1,
  bf$rip == 3         ~ 2,
  bf$rip %in% c(4, 5) ~ 3
)

bf <- bf %>% rename(c_titstu_1,new_istr)
ms$new_istr <- case_when(
  ms$istrm == "01" ~ 5,
  ms$istrm == "07" ~ 4,
  ms$istrm == "09" ~ 3,
  ms$istrm == "10" ~ 2,
  ms$istrm == "11" ~ 1
)

bf$new_staciv <- case_when(
  bf$staciv_1 == "01" ~ 1,
  bf$staciv_1 == "02" ~ 2,
  bf$staciv_1 == c("03","04","05") ~ 3,
  bf$staciv_1 == "06" ~ 4
)
ms$new_staciv <- case_when(
  bf$stcivm == 1 ~ 1,
  bf$stcivm == 2 ~ 2,
  bf$stcivm == 3 ~ 3,
  bf$stcivm == 6 ~ 4
)

ms$rgn <- ms$rgn/10

bf$new_eta <- case_when(
  bf$c_etacalc_1 == 1 ~ 1,
  bf$c_etacalc_1 == c(2,3) ~ 2,
  bf$c_etacalc_1 == 4 ~ 3,
  bf$c_etacalc_1 == 5 ~ 4,
  bf$c_etacalc_1 == 6 ~ 5,
  bf$c_etacalc_1 == 7 ~ 6,
  bf$c_etacalc_1 == 8 ~ 7,
  bf$c_etacalc_1 == 9 ~ 8,
  bf$c_etacalc_1 == 10 ~ 9,
  bf$c_etacalc_1 == 11 ~ 10,
  bf$c_etacalc_1 == 12 ~ 11,
  bf$c_etacalc_1 == c(13,14) ~ 12,
  bf$c_etacalc_1 == 15 ~ 13
)

ms$new_eta <- case_when(
  ms$etam == c("001","002") ~ 1,
  ms$etam == c("003","004","005","006") ~ 2,
  ms$etam == c("007","008") ~ 3,
  ms$etam == "009" ~ 4,
  ms$etam == "010" ~ 5,
  ms$etam == "011" ~ 6,
  ms$etam == "012" ~ 7,
  ms$etam == "013" ~ 8,
  ms$etam == "014" ~ 9,
  ms$etam == "015" ~ 10,
  ms$etam == "016" ~ 11,
  ms$etam == "017" ~ 12,
  ms$etam == "018" ~ 13
)

ms_h <- ms[!is.na(ms$votovi) & ms$new_eta > 2, ]
bf_h <- bf[bf$new_eta > 2, ]


group <- c("sesso","rip3")
x<-c("new_eta","ncomp","new_istr","new_staciv","Sitecon","Risecon","rgn")
z<-c("votovi","sitec","relam","relfam","salut","temlib","amb","sodlav2","futuasp","idia","idfa")

saveRDS(bf_h,"bf.rds")
saveRDS(ms_h,"ms.rds")
saveRDS(z,"z.rds")
saveRDS(x,"x.rds")
saveRDS(group,"group.rds")

# MATCHING

out.finalsynthetic<-NND.hotdeck(data.rec=readRDS("bf.rds"),data.don=readRDS("ms.rds"),match.vars=readRDS("x.rds"),don.class=readRDS("group.rds"), dist.fun="Gower", constrained=TRUE, constr.alg="Hungarian")

datR<-create.fused(data.rec=readRDS("bf.rds"),data.don=readRDS("ms.rds"), mtc.ids=out.finalsynthetic$mtc.ids,z.vars=readRDS("z.rds"))

#Create spending variables
datR <- datR %>% rename(exp = sp_tot_str_aggr_1)
datR$expA <- datR$d_01
datR$expHT <- datR$d_02
datR$expClo <- datR$d_03
datR$expC <- datR$d_04_str + datR$d_05
datR$expHPC <- datR$d_06_aggr_1 + datR$d108_mensile
datR$expT <- datR$d_07
datR$expComm <- datR$d_08
datR$expL <- datR$d_09
datR$expIstr <- datR$d_10

exp_var <- c("exp","expA", "expHT", "expClo", "expC","expHPC", "expT", "expComm", "expL", "expIstr")

#Apply Carbonaro scale
W_carbonaro <- c("2" = 1.67, "3" = 2.23, "4" = 2.72, "5" = 3.18, "6" = 3.61)

for (var in exp_var) {
  for (i in 1:nrow(datR)) {
    ncomp_i <- as.character(datR$ncomp[i])
    if (ncomp_i %in% names(W_carbonaro)) {
      datR[i, var] <- datR[i, var] / W_carbonaro[ncomp_i]
    }
  }
}

#Create log exp
for (var in exp_var) {
  log_var <- paste0("log", var)
  datR[[log_var]] <- log(datR[[var]])
  datR[[log_var]][is.infinite(datR[[log_var]])] <- NA
}

#Factor from polychoric matrix
datR$sitec14<-(5-datR$sitec)
datR$relfam14<-(5-datR$relfam)
datR$relam14<-(5-datR$relam)
datR$salut14<-(5-datR$salut)
datR$amb14<-(5-datR$amb)
datR$temlib14<-(5-datR$temlib)

vars <- c("sitec14", "relfam14", "relam14", "salut14", "amb14", "temlib14")
poly_result <- polychoric(datR[, vars])
R <- poly_result$rho  
fa_result <- fa(r = R, nfactors = 1, fm = "pa", rotate = "none")
lambda <- as.vector(fa_result$loadings[, 1])
beta <- solve(R) %*% lambda
X <- as.matrix(datR[, vars])
datR$Factor <- as.numeric(X %*% beta)
summary(datR$Factor)

#Control variables
datR$gender<-0
datR$gender[datR$sesso == 2] <- 1

datR$d5edu<-0
datR$d5edu[datR$new_istr == 5] <- 1

datR$midp<-NA
datR$midp[datR$new_eta == 3] <- 21
datR$midp[datR$new_eta == 4] <- 27
datR$midp[datR$new_eta == 5] <- 32
datR$midp[datR$new_eta == 6] <- 37
datR$midp[datR$new_eta == 7] <- 42
datR$midp[datR$new_eta == 8] <- 47
datR$midp[datR$new_eta == 9] <- 52
datR$midp[datR$new_eta == 10] <- 57
datR$midp[datR$new_eta == 11] <- 62
datR$midp[datR$new_eta == 12] <- 69.5
datR$midp[datR$new_eta == 13] <- 85
datR$lmidp<-log(datR$midp)

datR$marital <- 1
datR$marital[datR$new_staciv == 2] <- 2
datR$marital[datR$new_staciv == 3 | datR$new_staciv == 4 | datR$new_staciv == 5] <- 3

datR$risk <- 1
datR$risk[datR$cond_1 == 2 | datR$cond_1 == 3] <- 2
datR$risk[datR$cond_1 == 7] <- 3

datR$ageclass<-1
datR$ageclass[datR$midp == 3 | datR$midp == 4 | datR$midp == 5] <- 2
datR$ageclass[datR$midp == 6 | datR$midp == 7 | datR$midp == 8] <- 3
datR$ageclass[datR$midp == 9 | datR$midp == 10 | datR$midp == 11] <- 4

datR$futuasp14<-(5-datR$futuasp) 
datR$new_se<-(6-datR$Sitecon)

#Instrument
datR$m_geta_r_se_f <- ave(datR$exp, interaction(datR$gender, datR$rip, datR$new_se, datR$futuasp14, datR$ageclass), FUN = mean)
datR$from_sp8 <- datR$exp - datR$m_geta_r_se_f

